In [1]:
import pandas as pd
import numpy as np
In [7]:
# Column name -> Python type, passed as ``dtype=`` to every pd.read_csv call
# so that all CSV files are parsed with the same column types.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}
In [311]:
# Read the train / validation / test / full CSV files, in that order, all with
# the shared dtype mapping.
train, validation, test, sales = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'kc_house_data_small_train.csv',
        'kc_house_data_validation.csv',
        'kc_house_data_small_test.csv',
        'kc_house_data_small.csv',
    )
)
In [312]:
def get_numpy_data(data_sframe, features, output):
    """Extract the feature columns and the target column as arrays.

    Parameters
    ----------
    data_sframe : table that supports label-list indexing and exposes
        ``.values`` on the selection (the notebook passes pandas DataFrames).
    features : list of column labels that make up the feature matrix, in order.
    output : label of the target column.

    Returns
    -------
    tuple
        ``(feature_matrix, output_array)``. The target is selected with a
        one-element label list, so it is returned as a single-column 2-D
        array rather than a flat vector.
    """
    target_columns = [output]
    return data_sframe[features].values, data_sframe[target_columns].values
In [313]:
def normalize_features(features):
    """Scale every column of ``features`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    features : array-like whose norms are taken along axis 0 (column-wise for
        a 2-D feature matrix).

    Returns
    -------
    tuple
        ``(normalized_features, norms)`` where ``norms`` holds the per-column
        divisors that were actually applied. Divide any other data set by
        ``norms`` to put it on the same scale.

    Notes
    -----
    A column that is entirely zero has norm 0; dividing by it gives 0/0 = NaN,
    which then turns every distance computed from the matrix into NaN. Such
    columns are divided by 1 instead (they stay all-zero), and the returned
    ``norms`` carries that 1 so the same finite scaling is reused elsewhere.
    """
    norms = np.linalg.norm(features, axis=0)
    # Replace zero norms with 1 so constant-zero columns do not produce NaN/inf.
    norms = np.where(norms == 0, 1.0, norms)
    return (features / norms, norms)
In [314]:
# Columns used as model inputs; their order fixes the column order of the
# feature matrices built from this list.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]
# Every dtype_dict column except the identifier, date, zipcode and target,
# kept in dtype_dict order. A list built from a set difference would change
# order between interpreter runs because string hashing is randomized.
my_features = [
    column for column in dtype_dict
    if column not in ('id', 'date', 'zipcode', 'price')
]
# Turn each split into a feature matrix plus a single-column 'price' array.
features_train, output_train = get_numpy_data(
    data_sframe=train, features=feature_list, output='price')
features_valid, output_valid = get_numpy_data(
    data_sframe=validation, features=feature_list, output='price')
features_test, output_test = get_numpy_data(
    data_sframe=test, features=feature_list, output='price')
In [315]:
# Columns present in my_features but absent from feature_list
# (an empty set means feature_list covers all of them).
set(my_features).difference(feature_list)
Out[315]:
In [316]:
# Normalize the training features, then divide the other splits by the same
# training norms so all three share one scale.
# NOTE: the inputs are overwritten, so running this cell a second time would
# rescale data that has already been normalized.
features_train, norms = normalize_features(features_train)
features_test, features_valid = features_test / norms, features_valid / norms
In [317]:
# First test row and tenth training row, after normalization.
print(features_test[0], features_train[9], sep='\n')
In [318]:
import math
In [319]:
def get_distance(vec1, vec2):
    """Return the Euclidean distance between ``vec1`` and ``vec2``.

    The operands are subtracted element-wise, the differences are squared and
    summed, and the square root of that sum is returned as a Python float.
    """
    delta = vec1 - vec2
    squared_total = np.sum(np.square(delta))
    return math.sqrt(squared_total)
In [320]:
# Euclidean distance between the first test row and the tenth training row.
get_distance(features_test[0], features_train[9])
Out[320]:
In [321]:
# Distance from the first test row to each of the first ten training rows.
first_ten_distances = [
    get_distance(features_test[0], train_house)
    for train_house in features_train[0:10]
]
# min() keeps the earliest index on ties, the same as a strict "<" scan;
# default=None leaves both results as None when there is nothing to scan.
closest_house = min(
    range(len(first_ten_distances)),
    key=first_ten_distances.__getitem__,
    default=None,
)
min_distance = None if closest_house is None else first_ten_distances[closest_house]
In [322]:
# Smallest distance found, then the index of the training row that produced it.
print(min_distance, closest_house, sep='\n')
In [323]:
# Subtract the first test row from features_train in one vectorized operation.
diff = features_train - features_test[0]
In [324]:
# Sum of the component-wise differences in the last row of diff.
diff[-1].sum(axis=0)
Out[324]:
In [325]:
# Row-wise Euclidean length of diff: square, sum along axis 1, square root.
dist = np.sqrt(np.square(diff).sum(axis=1))
In [326]:
# Entry of dist at index 100.
dist[100]
Out[326]:
In [327]:
def compute_distances(features_instances, features_query):
    """Euclidean distance from ``features_query`` to each row of ``features_instances``.

    Parameters
    ----------
    features_instances : 2-D array, one instance per row (the sum runs over axis 1).
    features_query : vector subtracted from every row.

    Returns
    -------
    One distance per row of ``features_instances``.
    """
    squared_gaps = np.square(features_instances - features_query)
    return np.sqrt(squared_gaps.sum(axis=1))
In [328]:
# Distances from the third test row to every training row, followed by the
# index of the smallest one.
distances = compute_distances(features_train, features_test[2])
print(distances, np.argmin(distances), sep='\n')
In [296]:
# Positions where the distance equals the minimum. ndarray.min() is the
# vectorized reduction; the builtin min() would iterate the array element by
# element in Python.
np.where(distances == distances.min())
Out[296]:
In [297]:
# NOTE(review): 1149 is a hard-coded row index — presumably the nearest-neighbour
# index reported by the preceding cells; confirm it still holds if the data changes.
distances[1149]
Out[297]:
In [298]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Find the ``k`` training rows closest to ``features_query``.

    Parameters
    ----------
    k : number of neighbours to return.
    feature_train : 2-D array of training features, one instance per row.
    features_query : feature vector of the query point.

    Returns
    -------
    tuple
        ``(distances, neighbours)``: ``distances`` holds the distance from the
        query to every row of ``feature_train``; ``neighbours`` holds the row
        indices of the ``k`` smallest distances, nearest first.
    """
    # Use the matrix that was passed in; the earlier version silently read the
    # global ``features_train`` and ignored this argument.
    distances = compute_distances(feature_train, features_query)
    return distances, np.argsort(distances)[:k]
In [299]:
# Four nearest neighbours of the third test row: the full distance array plus
# the indices of the four smallest distances.
distances, neighbours = k_nearest_neighbors(4, features_train, features_test[2])
In [300]:
# Distance to each selected neighbour, one per line, then the neighbour indices.
for neighbour_index in neighbours:
    print(distances[neighbour_index])
print(neighbours)
In [301]:
# Indices of the selected neighbours.
print(neighbours)
In [302]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the target for one query as the mean over its ``k`` nearest neighbours.

    Parameters
    ----------
    k : number of neighbours to average over.
    features_train : training feature matrix handed to ``k_nearest_neighbors``.
    output_train : training targets, indexed by the neighbour indices.
    features_query : feature vector of the query point.

    Returns
    -------
    Mean of ``output_train`` over the selected neighbour indices.
    """
    _, nearest = k_nearest_neighbors(k, features_train, features_query)
    return output_train[nearest].mean()
In [303]:
# Prediction for the third test row with k=1.
predict_output_of_query(1, features_train, output_train, features_test[2])
Out[303]:
In [304]:
# Prediction for the third test row with k=4.
predict_output_of_query(4, features_train, output_train, features_test[2])
Out[304]:
In [305]:
# Actual 'price' value of the third test row, for comparison with the predictions.
print(output_test[2])
In [306]:
def predict_output(k, features_train, output_train, features_query):
    """Predict a target value for every row of ``features_query``.

    Each row is passed to ``predict_output_of_query`` with the given ``k`` and
    training data.

    Returns
    -------
    Float array of shape ``(number_of_query_rows, 1)``, one prediction per row.
    """
    per_query = [
        predict_output_of_query(k, features_train, output_train, query_row)
        for query_row in features_query
    ]
    return np.array(per_query, dtype=float).reshape(-1, 1)
In [307]:
# k=10 predictions for the first ten test rows, then the position of the
# lowest prediction among them.
predictions = predict_output(10, features_train, output_train, features_test[:10])
print(predictions, np.argmin(predictions), sep='\n')
In [308]:
# Actual 'price' values of the first ten test rows, for comparison.
print(output_test[:10])
In [309]:
# Residual sum of squares on the validation split for every k from 1 to 15.
rsss = []
for k in range(1, 16):
    predictions = predict_output(k, features_train, output_train, features_valid)
    error = predictions - output_valid
    # Row vector times column vector: a 1x1 array holding the sum of squared errors.
    rss = np.dot(error.T, error)
    rsss.append(rss)
    print('RSS for k=%s: %s' % (k, rss))
In [310]:
# Residual sum of squares on the test split.
# NOTE(review): k=3 is hard-coded — presumably the value chosen from the
# validation sweep; confirm it matches the k with the lowest validation RSS.
predictions = predict_output(3, features_train, output_train, features_test)
error = predictions - output_test
rss = np.dot(error.T, error)
print(rss)
In [ ]: